Linden 2016
Import simulation results
dataname <- "Linden_2015"
# import simulation results
sim_results <- readRDS(paste0("sim_results/simulation_results_final_", dataname,"_", n_rep, ".rds"))
# define classifiers
classifiers <- names(sim_results)
classifiers <- classifiers[!(classifiers %in% c("outcome", "true_e"))]
classifiers_clean <- c("adaBoost", "bagging" ,"kNN", "LDA", "Multinimial logit (glmnet)", "Multinimial logit (nnet)", "MLPC", "Naive Bayes (Bernulli)", "Naive Bayes (Gaussian)", "Probability forest (grf)", "Probability forest (ranger)", "QDA", "SVM", "XGBoost")
Compute average evaluation metrics
The predicted propensity score are evaluated using the Brier score,
the RMSE and the logistic loss.
bs_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
ll_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
rmse_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
bs_mat_oracle <- matrix(NA, nrow=n_rep, ncol = 1)
n=length(sim_results$outcome[[1]]$W)
for (i in 1:n_rep){
bs_mat_oracle[i] <- brier_score(probabilities=sim_results[["true_e"]][[i]],
outcome=sim_results[["outcome"]][[i]]$W,
binary = FALSE)
for (c in seq_along(classifiers)) {
classifier <- classifiers[c]
bs_mat[i, c] <- brier_score(probabilities=sim_results[[classifier]][[i]],
outcome=sim_results[["outcome"]][[i]]$W,
binary = FALSE)
rmse_mat[i, c] <- sqrt(mean(rowMeans((sim_results[[classifier]][[i]] - sim_results[["true_e"]][[i]])^2, na.rm = TRUE), na.rm = TRUE))
ll_mat[i, c] <- cross_entropy(sim_results[[classifier]][[i]], sim_results[["outcome"]][[i]]$W)
}
}
Reshape and aggregate results
- Brier score
bs_long <- bs_mat %>%
as.data.frame() %>%
pivot_longer(cols = everything()) %>%
# Separate the 'name' into 'classifier' and 'version' columns
mutate(version = case_when(
str_detect(name, "^ovr_") ~ "one-vs-rest",
str_detect(name, "^ovo_") ~ "one-vs-one",
TRUE ~ "multi-class"
),
name = gsub("ovr_|ovo_", "", name)
)
bs_wide <- bs_long %>%
group_by(version, name) %>%
summarise(value = round(mean(value), 6)) %>%
spread(key = version, value = value)
`summarise()` has grouped output by 'version'. You can override using the `.groups` argument.
kable(bs_wide, caption = "Brier Score")
Brier Score
| adaboost |
NA |
0.231426 |
0.233096 |
| bagging |
NA |
0.236621 |
0.243976 |
| knn |
0.225868 |
0.225448 |
0.218266 |
| lda |
0.213472 |
0.213510 |
0.213479 |
| logit |
0.213337 |
0.247198 |
0.213417 |
| logit_nnet |
0.213429 |
0.213533 |
0.213475 |
| mlpc |
0.213293 |
0.216825 |
0.216747 |
| nb_bernulli |
0.214456 |
0.214547 |
0.214486 |
| nb_gaussian |
0.214450 |
0.214495 |
0.214475 |
| probability_forest |
0.216493 |
0.215474 |
0.216540 |
| qda |
0.226034 |
0.226233 |
0.224151 |
| ranger |
0.235241 |
0.217772 |
0.217983 |
| svm |
NA |
0.220312 |
0.220447 |
| xgboost |
NA |
0.229347 |
0.218846 |
- RMSE
rmse_long <- rmse_mat %>%
as.data.frame() %>%
pivot_longer(cols = everything()) %>%
# Separate the 'name' into 'classifier' and 'version' columns
mutate(version = case_when(
str_detect(name, "^ovr_") ~ "one-vs-rest",
str_detect(name, "^ovo_") ~ "one-vs-one",
TRUE ~ "multi-class"
),
name = gsub("ovr_|ovo_", "", name)
)
rmse_wide <- rmse_long %>%
group_by(version, name) %>%
summarise(value = round(mean(value), 4)) %>%
spread(key = version, value = value)
`summarise()` has grouped output by 'version'. You can override using the `.groups` argument.
kable(rmse_wide, caption = "RMSE")
RMSE
| adaboost |
NA |
0.1366 |
0.1431 |
| bagging |
NA |
0.1543 |
0.1781 |
| knn |
0.1127 |
0.1103 |
0.0753 |
| lda |
0.0298 |
0.0299 |
0.0295 |
| logit |
0.0272 |
0.1857 |
0.0281 |
| logit_nnet |
0.0286 |
0.0297 |
0.0293 |
| mlpc |
0.0269 |
0.0650 |
0.0646 |
| nb_bernulli |
0.0427 |
0.0439 |
0.0429 |
| nb_gaussian |
0.0428 |
0.0439 |
0.0428 |
| probability_forest |
0.0623 |
0.0535 |
0.0628 |
| qda |
0.1157 |
0.1163 |
0.1063 |
| ranger |
0.1505 |
0.0720 |
0.0731 |
| svm |
NA |
0.0883 |
0.0891 |
| xgboost |
NA |
0.1290 |
0.0787 |
- Logistic loss
ll_long <- ll_mat %>%
as.data.frame() %>%
pivot_longer(cols = everything()) %>%
# Separate the 'name' into 'classifier' and 'version' columns
mutate(version = case_when(
str_detect(name, "^ovr_") ~ "one-vs-rest",
str_detect(name, "^ovo_") ~ "one-vs-one",
TRUE ~ "multi-class"
),
name = gsub("ovr_|ovo_", "", name)
)
ll_wide <- ll_long %>%
group_by(version, name) %>%
summarise(value = round(mean(value), 4)) %>%
spread(key = version, value = value)
`summarise()` has grouped output by 'version'. You can override using the `.groups` argument.
kable(ll_wide, caption = "Log Loss")
Log Loss
| adaboost |
NA |
1.1482 |
1.3000 |
| bagging |
NA |
1.1909 |
1.7804 |
| knn |
1.7681 |
NA |
1.0858 |
| lda |
1.0605 |
1.0608 |
1.0605 |
| logit |
1.0600 |
1.2253 |
1.0604 |
| logit_nnet |
1.0604 |
1.0609 |
1.0607 |
| mlpc |
1.0597 |
1.0819 |
1.0946 |
| nb_bernulli |
1.0653 |
1.0658 |
1.0657 |
| nb_gaussian |
1.0653 |
1.0655 |
1.0656 |
| probability_forest |
1.0751 |
1.0697 |
1.0754 |
| qda |
1.1820 |
NA |
1.1716 |
| ranger |
1.1746 |
1.0809 |
1.0828 |
| svm |
NA |
1.0905 |
1.0909 |
| xgboost |
NA |
1.1314 |
1.0837 |
Export the final results.
# Write bs_wide data frame as an Excel file
write.xlsx(bs_wide, file = paste0("sim_results/results_bs_", dataname, ".xlsx"))
# Write mse_wide data frame as an Excel file
write.xlsx(rmse_wide, file = paste0("sim_results/results_mse_", dataname, ".xlsx"))
Visualization
For the synthetic datasets, the true propensity scores e are known
and can be utilized to establish a benchmark for evaluating the Brier
score. This reference Brier score calculated using the ground truth e is
denoted as BS_oracle. BS_oracle has perfect information of the data and
can be seen as a lower bound for the Brier score. A second benchmark for
the Brier score is a naïve guess, using equal treatment propensity
scores of e = 1/T for every treatment level denoted as BS_naive.
K <- length(unique(sim_results$outcome[[1]]$W)) # number of treatments
BS_naive <- (((1/K)-1)^2 + (K-1)*((1/K)^2))/K # BS_naive
BS_oracle <- mean(bs_mat_oracle) # BS_oracle
plot1 <- ggplot(bs_long, aes(x = value, y = fct_rev(name), fill = version)) +
geom_density_ridges(alpha = 0.9, show.legend = FALSE) +
geom_vline(xintercept = BS_naive, color="black", linetype = "dashed", size = 0.4) +
geom_vline(xintercept = BS_oracle, color="black", linetype = "dashed", size = 0.4) +
scale_fill_paletteer_d("tayloRswift::midnightsBloodMoon", direction = 1, dynamic = FALSE) +
labs(y="", x="Brier Score", fill="") +
theme_bw() +
xlim(0.205, 0.25) +
scale_y_discrete(labels = rev(classifiers_clean)) +
facet_wrap(vars(version))
plot1
ggsave(plot1, width = 250, height = 125, units = "mm",
filename = "plots/dens1.png")

Another way to assess the quality of the predicted propensity scores
is to plot them against the ground truth in a scatter plot.
oracle_e <- reshape2::melt(sim_results[["true_e"]][[1]], id.vars = NULL, variable.name = "class", value.name = "true")
for (c in classifiers){
predicted_e <- reshape2::melt(sim_results[[c]][[1]], id.vars = NULL, variable.name = "class", value.name = "prediction") %>% select(-class)
plot_data <- cbind(predicted_e, oracle_e)
scatter_plot <- ggplot(data = plot_data,aes(y = prediction, x = true, color=class)) +
geom_point(alpha = 0.3, size=1) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
theme_bw() +
scale_color_manual(values=c("#D890A0FF", "#282020FF", "#F8B840FF")) +
labs(
x = "Predicted Probabilities ê(w)",
y = "True Probailities e(w)",
color = "W"
#title = paste0("Simulation results of ", c)
) +
xlim(0,1) + ylim(0,1)
plot(scatter_plot)
ggsave(scatter_plot, width = 125, height = 75, units = "mm",
filename = paste0("plots/truevspred_", c, ".png"))
}






































Imbens 2016
Import simulation results
dataname <- "Imbens_2016"
sim_results <- readRDS(paste0("sim_results/simulation_results_final_", dataname,"_", n_rep, ".rds"))
Compute average evaluation metrics
The predicted propensity score are evaluated using the Brier score,
the RMSE and the logistic loss.
bs_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
ll_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
rmse_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
bs_mat_oracle <- matrix(NA, nrow=n_rep, ncol = 1)
n=length(sim_results$outcome[[1]]$W)
for (i in 1:n_rep){
bs_mat_oracle[i] <- brier_score(probabilities=sim_results[["true_e"]][[i]],
outcome=sim_results[["outcome"]][[i]]$W,
binary = FALSE)
for (c in seq_along(classifiers)) {
classifier <- classifiers[c]
bs_mat[i, c] <- brier_score(probabilities=sim_results[[classifier]][[i]],
outcome=sim_results[["outcome"]][[i]]$W,
binary = FALSE)
rmse_mat[i, c] <- sqrt(mean(rowMeans((sim_results[[classifier]][[i]] - sim_results[["true_e"]][[i]])^2, na.rm = TRUE), na.rm = TRUE))
ll_mat[i, c] <- cross_entropy(sim_results[[classifier]][[i]], sim_results[["outcome"]][[i]]$W)
}
}
Reshape and aggregate results
- Brier score
bs_long <- bs_mat %>%
as.data.frame() %>%
pivot_longer(cols = everything()) %>%
# Separate the 'name' into 'classifier' and 'version' columns
mutate(version = case_when(
str_detect(name, "^ovr_") ~ "one-vs-rest",
str_detect(name, "^ovo_") ~ "one-vs-one",
TRUE ~ "multi-class"
),
name = gsub("ovr_|ovo_", "", name)
)
bs_wide <- bs_long %>%
group_by(version, name) %>%
summarise(value = round(mean(value), 4)) %>%
spread(key = version, value = value)
`summarise()` has grouped output by 'version'. You can override using the `.groups` argument.
kable(bs_wide, caption = "Brier Score")
Brier Score
| adaboost |
NA |
0.2320 |
0.2327 |
| bagging |
NA |
0.2325 |
0.2368 |
| knn |
0.2495 |
0.2481 |
0.2265 |
| lda |
0.2218 |
0.2219 |
0.2219 |
| logit |
0.2218 |
0.2287 |
0.2217 |
| logit_nnet |
0.2218 |
0.2220 |
0.2219 |
| mlpc |
0.2235 |
0.2358 |
0.2344 |
| nb_bernulli |
0.2257 |
0.2257 |
0.2249 |
| nb_gaussian |
0.2256 |
0.2260 |
0.2249 |
| probability_forest |
0.2213 |
0.2212 |
0.2214 |
| qda |
0.2260 |
0.2262 |
0.2255 |
| ranger |
0.2275 |
0.2234 |
0.2232 |
| svm |
NA |
0.2231 |
0.2229 |
| xgboost |
NA |
0.2252 |
0.2216 |
- RMSE
rmse_long <- rmse_mat %>%
as.data.frame() %>%
pivot_longer(cols = everything()) %>%
# Separate the 'name' into 'classifier' and 'version' columns
mutate(version = case_when(
str_detect(name, "^ovr_") ~ "one-vs-rest",
str_detect(name, "^ovo_") ~ "one-vs-one",
TRUE ~ "multi-class"
),
name = gsub("ovr_|ovo_", "", name)
)
rmse_wide <- rmse_long %>%
group_by(version, name) %>%
summarise(value = round(mean(value), 4)) %>%
spread(key = version, value = value)
`summarise()` has grouped output by 'version'. You can override using the `.groups` argument.
kable(rmse_wide, caption = "RMSE")
RMSE
| adaboost |
NA |
0.1255 |
0.1287 |
| bagging |
NA |
0.1306 |
0.1440 |
| knn |
0.1735 |
0.1755 |
0.1057 |
| lda |
0.0735 |
0.0743 |
0.0739 |
| logit |
0.0727 |
0.1123 |
0.0730 |
| logit_nnet |
0.0734 |
0.0744 |
0.0739 |
| mlpc |
0.0842 |
0.1396 |
0.1348 |
| nb_bernulli |
0.1038 |
0.1045 |
0.0990 |
| nb_gaussian |
0.1036 |
0.1049 |
0.0991 |
| probability_forest |
0.0702 |
0.0689 |
0.0706 |
| qda |
0.1041 |
0.1054 |
0.1008 |
| ranger |
0.1062 |
0.0837 |
0.0847 |
| svm |
NA |
0.0802 |
0.0788 |
| xgboost |
NA |
0.0938 |
0.0734 |
- Logistic loss
ll_long <- ll_mat %>%
as.data.frame() %>%
pivot_longer(cols = everything()) %>%
# Separate the 'name' into 'classifier' and 'version' columns
mutate(version = case_when(
str_detect(name, "^ovr_") ~ "one-vs-rest",
str_detect(name, "^ovo_") ~ "one-vs-one",
TRUE ~ "multi-class"
),
name = gsub("ovr_|ovo_", "", name)
)
ll_wide <- ll_long %>%
group_by(version, name) %>%
summarise(value = round(mean(value), 4)) %>%
spread(key = version, value = value)
`summarise()` has grouped output by 'version'. You can override using the `.groups` argument.
kable(ll_wide, caption = "Log Loss")
Log Loss
| adaboost |
NA |
1.1462 |
1.2067 |
| bagging |
NA |
1.1505 |
1.2421 |
| knn |
4.2125 |
NA |
1.1212 |
| lda |
1.0968 |
1.0973 |
1.0973 |
| logit |
1.0965 |
1.1282 |
1.0964 |
| logit_nnet |
1.0967 |
1.0973 |
1.0970 |
| mlpc |
1.1109 |
1.1692 |
1.5063 |
| nb_bernulli |
1.1351 |
NA |
1.1296 |
| nb_gaussian |
1.1349 |
1.1488 |
1.1295 |
| probability_forest |
1.0945 |
1.0940 |
1.0951 |
| qda |
1.1377 |
1.1509 |
1.1340 |
| ranger |
1.1238 |
1.1043 |
1.1039 |
| svm |
NA |
1.1024 |
1.1018 |
| xgboost |
NA |
1.1123 |
1.0958 |
Export the final results.
# Write bs_wide data frame as an Excel file
write.xlsx(bs_wide, file = paste0("sim_results/results_bs_", dataname, ".xlsx"))
# Write mse_wide data frame as an Excel file
write.xlsx(rmse_wide, file = paste0("sim_results/results_mse_", dataname, ".xlsx"))
Visualization
K <- length(unique(sim_results$outcome[[1]]$W))
BS_naive <- (((1/K)-1)^2 + (K-1)*((1/K)^2))/K
LL_unif <- -(log(1/K) + log(1-(1/K)))
BS_oracle <- mean(bs_mat_oracle)
plot2 <- ggplot(bs_long, aes(x = value, y = fct_rev(name), fill = version)) +
geom_density_ridges(alpha = 0.9, show.legend = FALSE) +
geom_vline(xintercept = BS_naive, color="black", linetype = "dashed") +
geom_vline(xintercept = BS_oracle, color="black", linetype = "dashed") +
scale_fill_paletteer_d("tayloRswift::midnightsBloodMoon", direction = 1, dynamic = FALSE) +
labs(y="", x="Brier Score", fill="") +
theme_bw() +
xlim(0.215, 0.25) +
scale_y_discrete(labels = rev(classifiers_clean)) +
facet_wrap(vars(version))
plot2
ggsave(plot2, width = 250, height = 125, units = "mm",
filename = "plots/dens2.png")

pi <- sim_results$true_e[[1]]
bs_oracle <- brier_score(probabilities = pi, outcome = sim_results[["outcome"]][[1]]$W)
ll_oracle <- cross_entropy(probabilities = pi, outcome = sim_results[["outcome"]][[1]]$W)
predicted_e <- reshape2::melt(sim_results[[c]][[1]], id.vars = NULL, variable.name = "class", value.name = "prediction")
for (c in classifiers){
oracle_e <- reshape2::melt(pi, id.vars = NULL, variable.name = "class", value.name = "true") %>% select(true)
plot_data <- cbind(predicted_e, oracle_e)
scatter_plot <- ggplot(data = plot_data,aes(y = prediction, x = true, color=class)) +
geom_point(alpha = 0.3) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
geom_smooth(color="red", se = FALSE, linewidth = 0.7)+
theme_bw() +
scale_color_paletteer_d("tayloRswift::midnightsBloodMoon", direction = 1, dynamic = FALSE) +
labs(
x = "Predicted Probabilities",
y = "True Probailities",
title = paste0("Simulation results of ", c)
) +
xlim(0,1) + ylim(0,1)
plot(scatter_plot)
}






































Acharky 2023
Import simulation results
dataname <- "Acharky_2023"
n_rep <- 100
sim_results <- readRDS(paste0("sim_results/simulation_results_ovo_", dataname,"_", n_rep, ".rds"))
Compute average evaluation metrics
The predicted propensity score are evaluated using the Brier score
and the logistic loss. As for the semi-synthetic dataset the ground
truth of the propensity score is not know, the RMSE cannot be
calculated.
bs_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
ll_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
n=length(sim_results$outcome[[1]]$W)
for (c in seq_along(classifiers)) {
classifier <- classifiers[c]
for (i in 1:n_rep){
bs <- brier_score(sim_results[[classifier]][[i]], sim_results[["outcome"]][[i]]$W, binary = FALSE)
bs_mat[i, c] <- bs
ll <- cross_entropy(sim_results[[classifier]][[i]], sim_results[["outcome"]][[i]]$W)
ll_mat[i, c] <- ll
}
}
Reshape and aggregate results
- Brier score
bs_long <- bs_mat %>%
as.data.frame() %>%
pivot_longer(cols = everything()) %>%
# Separate the 'name' into 'classifier' and 'version' columns
mutate(version = case_when(
str_detect(name, "^ovr_") ~ "one-vs-rest",
str_detect(name, "^ovo_") ~ "one-vs-one",
TRUE ~ "multi-class"
),
name = gsub("ovr_|ovo_", "", name)
)
bs_wide <- bs_long %>%
group_by(version, name) %>%
summarise(value = round(mean(value), 4)) %>%
spread(key = version, value = value)
`summarise()` has grouped output by 'version'. You can override using the `.groups` argument.
kable(bs_wide, caption = "Brier Score")
Brier Score
| adaboost |
NA |
0.0719 |
0.0732 |
| bagging |
NA |
0.0721 |
0.0807 |
| knn |
0.1132 |
0.0750 |
0.0778 |
| lda |
0.0712 |
0.0712 |
0.0712 |
| logit |
0.0712 |
0.0712 |
0.0712 |
| logit_nnet |
0.0711 |
0.0712 |
0.0712 |
| mlpc |
0.0710 |
0.0710 |
0.0710 |
| nb_bernulli |
0.0713 |
0.0713 |
0.0713 |
| nb_gaussian |
0.0713 |
0.0713 |
0.0713 |
| probability_forest |
0.0715 |
0.0711 |
0.0715 |
| qda |
0.0719 |
0.0720 |
0.0720 |
| ranger |
0.0727 |
0.0712 |
0.0712 |
| svm |
NA |
0.0710 |
0.0711 |
| xgboost |
NA |
0.0711 |
0.0780 |
- Logistic loss
ll_long <- ll_mat %>%
as.data.frame() %>%
pivot_longer(cols = everything()) %>%
# Separate the 'name' into 'classifier' and 'version' columns
mutate(version = case_when(
str_detect(name, "^ovr_") ~ "one-vs-rest",
str_detect(name, "^ovo_") ~ "one-vs-one",
TRUE ~ "multi-class"
),
name = gsub("ovr_|ovo_", "", name)
)
ll_wide <- ll_long %>%
group_by(version, name) %>%
summarise(value = round(mean(value), 4)) %>%
spread(key = version, value = value)
`summarise()` has grouped output by 'version'. You can override using the `.groups` argument.
kable(ll_wide, caption = "Log Loss")
Log Loss
| adaboost |
NA |
2.6336 |
4.0974 |
| bagging |
NA |
2.6560 |
20.4284 |
| knn |
36.5656 |
2.8852 |
13.3819 |
| lda |
2.5806 |
2.5814 |
2.5810 |
| logit |
2.5781 |
2.5801 |
2.5788 |
| logit_nnet |
2.5727 |
2.5816 |
2.5808 |
| mlpc |
2.5666 |
2.5666 |
2.5668 |
| nb_bernulli |
2.5914 |
2.5925 |
2.5927 |
| nb_gaussian |
2.5910 |
2.5927 |
2.5928 |
| probability_forest |
2.6062 |
2.5729 |
2.6073 |
| qda |
2.6460 |
2.6512 |
2.6507 |
| ranger |
2.6948 |
2.5816 |
2.5822 |
| svm |
NA |
2.5676 |
2.5693 |
| xgboost |
NA |
2.5697 |
3.1047 |
Export the final results.
# Write bs_wide data frame as an Excel file
write.xlsx(bs_wide, file = paste0("sim_results/results_bs_", dataname, ".xlsx"))
Visualization
K <- length(unique(sim_results$outcome[[1]]$W))
BS_oracle <- (((1/K)-1)^2 + (K-1)*((1/K)^2))/K
LL_unif <- -(log(1/K) + log(1-(1/K)))
plot3 <- ggplot(bs_long, aes(x = value, y = fct_rev(name), fill = version)) +
geom_density_ridges(alpha = 0.9, show.legend = FALSE) +
geom_vline(xintercept = BS_oracle, color="black", linetype = "dashed") +
scale_fill_paletteer_d("tayloRswift::midnightsBloodMoon", direction = 1, dynamic = FALSE) +
labs(y="", x="Brier Score", fill="") +
theme_bw() +
scale_x_continuous(breaks = c(0.071, 0.0715, 0.072), limits = c(0.0708, 0.0721)) +
scale_y_discrete(labels = rev(classifiers_clean)) +
facet_wrap(vars(version))
plot3
ggsave(plot3, width = 250, height = 125, units = "mm",
filename = "plots/dens3.png")

for (c in classifiers){
plot(make_calibtation_plot(probabilities = sim_results[[c]][[1]], outcome = sim_results$outcome[[1]]$W, method = c))
}
`summarise()` has grouped output by 'probability_bin'. You can override using the `.groups` argument.






































---
title: "Evaluation of Simulation results"
author: "Maren Baumgärtner"
output: 
  html_notebook:
    toc: true
    toc_float: true
    code_folding: show
---

```{r, message=FALSE, echo=FALSE, warning=FALSE}
# Global setup shared by all three data sections of this notebook.
set.seed(42) # set seed
options(scipen= 999) # prevent scientific notation
source("packages.R") # load packages
source("eval_functions.R") # import evaluation functions (presumably brier_score, cross_entropy, make_calibtation_plot — verify)

n_rep <- 100 # define number of iterations
```

# Linden 2016

### Import simulation results

```{r}
dataname <- "Linden_2015"

# import simulation results
sim_results <- readRDS(paste0("sim_results/simulation_results_final_", dataname,"_", n_rep, ".rds"))

# define classifiers: all list entries except the outcome and the true propensity scores
classifiers <- names(sim_results)
classifiers <- classifiers[!(classifiers %in% c("outcome", "true_e"))]

# Display names used as plot axis labels below
# (typos fixed: "Multinimial" -> "Multinomial", "Bernulli" -> "Bernoulli")
classifiers_clean <- c("adaBoost", "bagging", "kNN", "LDA", "Multinomial logit (glmnet)", "Multinomial logit (nnet)", "MLPC", "Naive Bayes (Bernoulli)", "Naive Bayes (Gaussian)", "Probability forest (grf)", "Probability forest (ranger)", "QDA", "SVM", "XGBoost")
```

### Compute average evaluation metrics

The predicted propensity scores are evaluated using the Brier score, the RMSE and the logistic loss.

```{r}
# One row per replication, one column per classifier.
bs_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
ll_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
rmse_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))

# Oracle Brier score (from the true propensity scores), one per replication
bs_mat_oracle <- matrix(NA, nrow = n_rep, ncol = 1)

n <- length(sim_results$outcome[[1]]$W) # sample size of one replication
for (i in seq_len(n_rep)) {
  bs_mat_oracle[i] <- brier_score(probabilities = sim_results[["true_e"]][[i]],
                                  outcome = sim_results[["outcome"]][[i]]$W,
                                  binary = FALSE)

  for (c in seq_along(classifiers)) {
    classifier <- classifiers[c]

    # Brier score of the predicted propensity scores
    bs_mat[i, c] <- brier_score(probabilities = sim_results[[classifier]][[i]],
                                outcome = sim_results[["outcome"]][[i]]$W,
                                binary = FALSE)

    # RMSE against the known true propensity scores
    rmse_mat[i, c] <- sqrt(mean(rowMeans((sim_results[[classifier]][[i]] - sim_results[["true_e"]][[i]])^2, na.rm = TRUE), na.rm = TRUE))

    # Logistic loss (cross entropy)
    ll_mat[i, c] <- cross_entropy(sim_results[[classifier]][[i]], sim_results[["outcome"]][[i]]$W)
  }
}
```

### Reshape and aggregate results

1.  Brier score

```{r}
# Long format: one row per (replication, classifier); split the column name
# into the classifier ('name') and the multi-class strategy ('version').
bs_long <- bs_mat %>%
  as.data.frame() %>%
  pivot_longer(cols = everything())  %>%
  mutate(version = case_when(
           str_detect(name, "^ovr_") ~ "one-vs-rest",
           str_detect(name, "^ovo_") ~ "one-vs-one",
           TRUE ~ "multi-class"
         ),
         name = gsub("ovr_|ovo_", "", name)
         )

# Average over replications. Rounding digits fixed: was 635 (a typo that
# effectively disabled rounding); the published table uses 6 digits.
bs_wide <- bs_long %>%
  group_by(version, name) %>%
  summarise(value = round(mean(value), 6), .groups = "drop") %>%
  spread(key = version, value = value)

kable(bs_wide, caption = "Brier Score")
```

2.  RMSE

```{r}
# Long format: split the column name into classifier and multi-class strategy.
rmse_long <- rmse_mat %>%
  as.data.frame() %>%
  pivot_longer(cols = everything())  %>%
  mutate(version = case_when(
           str_detect(name, "^ovr_") ~ "one-vs-rest",
           str_detect(name, "^ovo_") ~ "one-vs-one",
           TRUE ~ "multi-class"
         ),
         name = gsub("ovr_|ovo_", "", name)
         )

# Average over replications; drop grouping so downstream code gets a plain table
# (also silences the `summarise()` grouping message).
rmse_wide <- rmse_long %>%
  group_by(version, name) %>%
  summarise(value = round(mean(value), 4), .groups = "drop") %>%
  spread(key = version, value = value)

kable(rmse_wide, caption = "RMSE")
```

3.  Logistic loss

```{r}
# Long format: split the column name into classifier and multi-class strategy.
ll_long <- ll_mat %>%
  as.data.frame() %>%
  pivot_longer(cols = everything())  %>%
  mutate(version = case_when(
           str_detect(name, "^ovr_") ~ "one-vs-rest",
           str_detect(name, "^ovo_") ~ "one-vs-one",
           TRUE ~ "multi-class"
         ),
         name = gsub("ovr_|ovo_", "", name)
         )

# Average over replications; drop grouping so downstream code gets a plain table.
ll_wide <- ll_long %>%
  group_by(version, name) %>%
  summarise(value = round(mean(value), 4), .groups = "drop") %>%
  spread(key = version, value = value)

kable(ll_wide, caption = "Log Loss")
```

Export the final results.

```{r}
# Write bs_wide data frame as an Excel file
write.xlsx(bs_wide, file = paste0("sim_results/results_bs_", dataname, ".xlsx"))

# Write rmse_wide as an Excel file (file name says "mse" — kept for compatibility)
write.xlsx(rmse_wide, file = paste0("sim_results/results_mse_", dataname, ".xlsx"))

```

### Visualization

For the synthetic datasets, the true propensity scores e are known and can be utilized to establish a benchmark for evaluating the Brier score. This reference Brier score, calculated using the ground truth e, is denoted as BS_oracle. BS_oracle has perfect information about the data and can be seen as a lower bound for the Brier score. A second benchmark for the Brier score is a naïve guess, using equal treatment propensity scores of e = 1/K for each of the K treatment levels, denoted as BS_naive.

```{r}
K <- length(unique(sim_results$outcome[[1]]$W)) # number of treatments
BS_naive <- (((1/K)-1)^2 + (K-1)*((1/K)^2))/K   # Brier score of a uniform 1/K guess
BS_oracle <- mean(bs_mat_oracle)                # mean Brier score of the true e

# Density of Brier scores per classifier, faceted by multi-class strategy,
# with the naive and oracle benchmarks as dashed reference lines.
plot1 <- ggplot(bs_long, aes(x = value, y = fct_rev(name), fill = version)) +
  geom_density_ridges(alpha = 0.9, show.legend = FALSE) +
  # `size` for lines is deprecated since ggplot2 3.4 — use `linewidth`
  geom_vline(xintercept = BS_naive, color = "black", linetype = "dashed", linewidth = 0.4) +
  geom_vline(xintercept = BS_oracle, color = "black", linetype = "dashed", linewidth = 0.4) +
  scale_fill_paletteer_d("tayloRswift::midnightsBloodMoon", direction = 1, dynamic = FALSE) +
  labs(y="", x="Brier Score", fill="") +
  theme_bw() +
  xlim(0.205, 0.25) +
  scale_y_discrete(labels = rev(classifiers_clean)) +
  facet_wrap(vars(version))
plot1
ggsave(plot1, width = 250, height = 125, units = "mm",
       filename = "plots/dens1.png")

```

Another way to assess the quality of the predicted propensity scores is to plot them against the ground truth in a scatter plot.

```{r}
# True propensity scores of the first replication in long format (loop-invariant)
oracle_e <- reshape2::melt(sim_results[["true_e"]][[1]], id.vars = NULL, variable.name = "class", value.name = "true")

# One predicted-vs-true scatter plot per classifier (first replication only)
for (c in classifiers){
  predicted_e <- reshape2::melt(sim_results[[c]][[1]], id.vars = NULL, variable.name = "class", value.name = "prediction") %>% select(-class)

  plot_data <- cbind(predicted_e, oracle_e)

  scatter_plot <- ggplot(data = plot_data, aes(y = prediction, x = true, color = class)) +
    geom_point(alpha = 0.3, size = 1) +
    # 45-degree line: perfect prediction
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
    theme_bw() +
    scale_color_manual(values = c("#D890A0FF", "#282020FF", "#F8B840FF")) +
    labs(
          x = "Predicted Probabilities ê(w)",
          y = "True Probabilities e(w)", # fixed typo ("Probailities")
          color = "W"
        ) +
    xlim(0,1) + ylim(0,1)

  plot(scatter_plot)
  ggsave(scatter_plot, width = 125, height = 75, units = "mm",
         filename = paste0("plots/truevspred_", c, ".png"))
}
```

# Imbens 2016

### Import simulation results

```{r}
# Reuses `classifiers`, `classifiers_clean` and `n_rep` defined in the Linden section.
dataname <- "Imbens_2016"

sim_results <- readRDS(paste0("sim_results/simulation_results_final_", dataname,"_", n_rep, ".rds"))
```

### Compute average evaluation metrics

The predicted propensity scores are evaluated using the Brier score, the RMSE and the logistic loss.

```{r}
# One row per replication, one column per classifier.
bs_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
ll_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
rmse_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))

# Oracle Brier score (from the true propensity scores), one per replication
bs_mat_oracle <- matrix(NA, nrow = n_rep, ncol = 1)

n <- length(sim_results$outcome[[1]]$W) # sample size of one replication
for (i in seq_len(n_rep)) {
  bs_mat_oracle[i] <- brier_score(probabilities = sim_results[["true_e"]][[i]],
                                  outcome = sim_results[["outcome"]][[i]]$W,
                                  binary = FALSE)

  for (c in seq_along(classifiers)) {
    classifier <- classifiers[c]

    # Brier score of the predicted propensity scores
    bs_mat[i, c] <- brier_score(probabilities = sim_results[[classifier]][[i]],
                                outcome = sim_results[["outcome"]][[i]]$W,
                                binary = FALSE)

    # RMSE against the known true propensity scores
    rmse_mat[i, c] <- sqrt(mean(rowMeans((sim_results[[classifier]][[i]] - sim_results[["true_e"]][[i]])^2, na.rm = TRUE), na.rm = TRUE))

    # Logistic loss (cross entropy)
    ll_mat[i, c] <- cross_entropy(sim_results[[classifier]][[i]], sim_results[["outcome"]][[i]]$W)
  }
}
```

### Reshape and aggregate results

1.  Brier score

```{r}
# Long format: split the column name into classifier and multi-class strategy.
bs_long <- bs_mat %>%
  as.data.frame() %>%
  pivot_longer(cols = everything())  %>%
  mutate(version = case_when(
           str_detect(name, "^ovr_") ~ "one-vs-rest",
           str_detect(name, "^ovo_") ~ "one-vs-one",
           TRUE ~ "multi-class"
         ),
         name = gsub("ovr_|ovo_", "", name)
         )

# Average over replications; drop grouping so downstream code gets a plain table.
bs_wide <- bs_long %>%
  group_by(version, name) %>%
  summarise(value = round(mean(value), 4), .groups = "drop") %>%
  spread(key = version, value = value)

kable(bs_wide, caption = "Brier Score")
```

2.  RMSE

```{r}
# Long format: split the column name into classifier and multi-class strategy.
rmse_long <- rmse_mat %>%
  as.data.frame() %>%
  pivot_longer(cols = everything())  %>%
  mutate(version = case_when(
           str_detect(name, "^ovr_") ~ "one-vs-rest",
           str_detect(name, "^ovo_") ~ "one-vs-one",
           TRUE ~ "multi-class"
         ),
         name = gsub("ovr_|ovo_", "", name)
         )

# Average over replications; drop grouping so downstream code gets a plain table.
rmse_wide <- rmse_long %>%
  group_by(version, name) %>%
  summarise(value = round(mean(value), 4), .groups = "drop") %>%
  spread(key = version, value = value)

kable(rmse_wide, caption = "RMSE")
```

3.  Logistic loss

```{r}
# Long format: split the column name into classifier and multi-class strategy.
ll_long <- ll_mat %>%
  as.data.frame() %>%
  pivot_longer(cols = everything())  %>%
  mutate(version = case_when(
           str_detect(name, "^ovr_") ~ "one-vs-rest",
           str_detect(name, "^ovo_") ~ "one-vs-one",
           TRUE ~ "multi-class"
         ),
         name = gsub("ovr_|ovo_", "", name)
         )

# Average over replications; drop grouping so downstream code gets a plain table.
ll_wide <- ll_long %>%
  group_by(version, name) %>%
  summarise(value = round(mean(value), 4), .groups = "drop") %>%
  spread(key = version, value = value)

kable(ll_wide, caption = "Log Loss")
```

Export the final results.

```{r}
# Write bs_wide data frame as an Excel file
write.xlsx(bs_wide, file = paste0("sim_results/results_bs_", dataname, ".xlsx"))

# Write rmse_wide as an Excel file (file name says "mse" — kept for compatibility)
write.xlsx(rmse_wide, file = paste0("sim_results/results_mse_", dataname, ".xlsx"))

```

### Visualization

```{r}
K <- length(unique(sim_results$outcome[[1]]$W)) # number of treatments
BS_naive <- (((1/K)-1)^2 + (K-1)*((1/K)^2))/K # Brier score of a uniform 1/K guess
LL_unif <- -(log(1/K) + log(1-(1/K))) # NOTE(review): computed but never used below
BS_oracle <- mean(bs_mat_oracle) # mean oracle Brier score (true propensity scores)

# Density of Brier scores per classifier with naive/oracle benchmark lines
plot2 <- ggplot(bs_long, aes(x = value, y = fct_rev(name), fill = version)) +
  geom_density_ridges(alpha = 0.9, show.legend = FALSE) +
  geom_vline(xintercept = BS_naive, color="black", linetype = "dashed") +
  geom_vline(xintercept = BS_oracle, color="black", linetype = "dashed") +
  scale_fill_paletteer_d("tayloRswift::midnightsBloodMoon", direction = 1, dynamic = FALSE) +
  labs(y="", x="Brier Score", fill="") +
  theme_bw() +
  xlim(0.215, 0.25) +
  scale_y_discrete(labels = rev(classifiers_clean)) +
  facet_wrap(vars(version))
plot2
ggsave(plot2, width = 250, height = 125, units = "mm",
       filename = "plots/dens2.png")

```

```{r}
pi <- sim_results$true_e[[1]] # true propensity scores of the first replication

# NOTE(review): unlike the metric loops above, binary = FALSE is not passed here —
# confirm the defaults of brier_score()/cross_entropy() handle the multi-class case
bs_oracle <- brier_score(probabilities = pi, outcome = sim_results[["outcome"]][[1]]$W)
ll_oracle <- cross_entropy(probabilities = pi, outcome = sim_results[["outcome"]][[1]]$W)

# Long format of the true propensity scores; loop-invariant, computed once
oracle_e <- reshape2::melt(pi, id.vars = NULL, variable.name = "class", value.name = "true") %>% select(true)

for (c in classifiers){
  # BUG FIX: predicted_e used to be computed once before the loop with a stale
  # `c`, so every iteration plotted the same classifier's predictions.
  predicted_e <- reshape2::melt(sim_results[[c]][[1]], id.vars = NULL, variable.name = "class", value.name = "prediction")
  plot_data <- cbind(predicted_e, oracle_e)

  scatter_plot <- ggplot(data = plot_data, aes(y = prediction, x = true, color = class)) +
    geom_point(alpha = 0.3) +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
    geom_smooth(color = "red", se = FALSE, linewidth = 0.7) +
    theme_bw() +
    scale_color_paletteer_d("tayloRswift::midnightsBloodMoon", direction = 1, dynamic = FALSE) +
    labs(
          x = "Predicted Probabilities",
          y = "True Probabilities", # fixed typo ("Probailities")
          title = paste0("Simulation results of ", c)
        ) +
    xlim(0,1) + ylim(0,1)

  plot(scatter_plot)
}
```

# Acharky 2023

### Import simulation results

```{r}
dataname <- "Acharky_2023"
n_rep <- 100 # re-set explicitly (same value as the setup chunk)

# note the different file prefix ("..._ovo_") compared to the other sections
sim_results <- readRDS(paste0("sim_results/simulation_results_ovo_", dataname,"_", n_rep, ".rds"))
```

### Compute average evaluation metrics

The predicted propensity scores are evaluated using the Brier score and the logistic loss. As the ground truth of the propensity score is not known for the semi-synthetic dataset, the RMSE cannot be calculated.

```{r}
# One row per replication, one column per classifier. No RMSE here: the true
# propensity scores are unknown for this semi-synthetic dataset.
bs_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))
ll_mat <- matrix(NA, nrow = n_rep, ncol = length(classifiers), dimnames = list(NULL, classifiers))

n <- length(sim_results$outcome[[1]]$W) # sample size of one replication
for (c in seq_along(classifiers)) {
  classifier <- classifiers[c]
  for (i in seq_len(n_rep)) {
    # Brier score of the predicted propensity scores
    bs_mat[i, c] <- brier_score(sim_results[[classifier]][[i]], sim_results[["outcome"]][[i]]$W, binary = FALSE)

    # Logistic loss (cross entropy)
    ll_mat[i, c] <- cross_entropy(sim_results[[classifier]][[i]], sim_results[["outcome"]][[i]]$W)
  }
}
```

### Reshape and aggregate results

1.  Brier score

```{r}
# Long format: split the column name into classifier and multi-class strategy.
bs_long <- bs_mat %>%
  as.data.frame() %>%
  pivot_longer(cols = everything())  %>%
  mutate(version = case_when(
           str_detect(name, "^ovr_") ~ "one-vs-rest",
           str_detect(name, "^ovo_") ~ "one-vs-one",
           TRUE ~ "multi-class"
         ),
         name = gsub("ovr_|ovo_", "", name)
         )

# Average over replications; drop grouping so downstream code gets a plain table.
bs_wide <- bs_long %>%
  group_by(version, name) %>%
  summarise(value = round(mean(value), 4), .groups = "drop") %>%
  spread(key = version, value = value)

kable(bs_wide, caption = "Brier Score")
```

2.  Logistic loss

```{r}
# Long format: split the column name into classifier and multi-class strategy.
ll_long <- ll_mat %>%
  as.data.frame() %>%
  pivot_longer(cols = everything())  %>%
  mutate(version = case_when(
           str_detect(name, "^ovr_") ~ "one-vs-rest",
           str_detect(name, "^ovo_") ~ "one-vs-one",
           TRUE ~ "multi-class"
         ),
         name = gsub("ovr_|ovo_", "", name)
         )

# Average over replications; drop grouping so downstream code gets a plain table.
ll_wide <- ll_long %>%
  group_by(version, name) %>%
  summarise(value = round(mean(value), 4), .groups = "drop") %>%
  spread(key = version, value = value)

kable(ll_wide, caption = "Log Loss")
```

Export the final results.

```{r}
# Write bs_wide data frame as an Excel file (no RMSE export: ground truth unknown)
write.xlsx(bs_wide, file = paste0("sim_results/results_bs_", dataname, ".xlsx"))
```

### Visualization

```{r}
K <- length(unique(sim_results$outcome[[1]]$W)) # number of treatments
# Only benchmark available here is the uniform 1/K guess: the true propensity
# scores are unknown for this dataset. Renamed from BS_oracle, which was
# misleading (the synthetic sections call this same formula BS_naive).
BS_naive <- (((1/K)-1)^2 + (K-1)*((1/K)^2))/K
LL_unif <- -(log(1/K) + log(1-(1/K))) # NOTE(review): computed but never used below

plot3 <- ggplot(bs_long, aes(x = value, y = fct_rev(name), fill = version)) +
  geom_density_ridges(alpha = 0.9, show.legend = FALSE) +
  geom_vline(xintercept = BS_naive, color = "black", linetype = "dashed") +
  scale_fill_paletteer_d("tayloRswift::midnightsBloodMoon", direction = 1, dynamic = FALSE) +
  labs(y="", x="Brier Score", fill="") +
  theme_bw() +
  scale_x_continuous(breaks = c(0.071, 0.0715, 0.072), limits = c(0.0708, 0.0721)) +
  scale_y_discrete(labels = rev(classifiers_clean)) +
  facet_wrap(vars(version))
plot3
ggsave(plot3, width = 250, height = 125, units = "mm",
       filename = "plots/dens3.png")

```

```{r}
# One calibration plot per classifier (first replication only).
# NOTE(review): make_calibtation_plot comes from eval_functions.R; its name
# contains a typo ("calibtation") — fix at the definition site, not here.
for (c in classifiers){
  plot(make_calibtation_plot(probabilities = sim_results[[c]][[1]], outcome = sim_results$outcome[[1]]$W, method = c))
}
```
